/**********************************************************************
gc_3_create_individual_panel.do

**********************************************************************/
**********
* SET UP *
**********
* Start from an empty session before anything else is configured.
clear all
set matsize 2000
* NOTE(review): "set more 1" looks like the legacy spelling for switching the
* -more- pause off; confirm it behaves like "set more off" on the Stata
* version in use.
set more 1

* location for dofiles *
cd "T:\_Projet_4915\dofiles"
* Keep the do-file directory in the global dir.
global dir "`c(pwd)'"
* The path is quoted so that the cd still works if the directory name ever
* contains blanks.
cd "$dir"

*********
* GATES *
*********
* Data flavour to process: syn = synthetic, rl = real *
local ext "rl"

* First and last cross-section (tax year) to process *
local start_year 2001
local end_year 2012

* Each gate switches one step on: a step runs only when its gate equals 1 *

* STEP 1: make early sample restrictions *
local gate1 1

* STEP 2: append all cross-sections and create panel dataset *
local gate2 1

* STEP 3: make sample restrictions and clean a little bit *
local gate3 1

* STEP 4: delete all intermediate datasets *
local gate4 1

* Close any log that is still open (errors are ignored when there is none), *
* then open a fresh text log named after the data flavour                   *
capture quietly log close
quietly log using "gc_3_create_individual_panel_`ext'", text replace

* specify file locations *
* project_folder, output_folder and temp are not referenced again in this
* do-file; they are kept because globals stay visible to other code.
global project_folder "\_Projet_4915"
global data_folder "\_Projet_4915\DATA"
global output_folder "\_Projet_4915\ResultsFolder"
global temp "temp"

* Directory prefix used for every dataset that is read or written below.
* No backslash is placed directly in front of the macro reference, because
* Stata reads a backslash followed by a macro as an escape that postpones
* expansion. The drive letter is joined straight to data_folder (which
* already starts with a separator) and the trailing separator is a forward
* slash, which Stata accepts on Windows.
local datadir "T:${data_folder}/"

*******************************************
* MAKE A GLOBAL VARIABLE FOR TODAY'S DATE *
*******************************************
* Turn the system date (day, month abbreviation, year) into a Stata daily
* date, then print it back as an 8-character YYYYMMDD string: CCYY is the
* four-digit year, NN the zero-padded month number, DD the zero-padded day.
local today = date(trim("$S_DATE"), "DMY")

* put final date together *
global date = string(`today', "%tdCCYYNNDD")

****************************************
* DEFINE LITTLE PROGRAMS TO PRINT TIME *
****************************************
program define starttime
	* Print the current system time and date as the start marker of a step.
	local stamp "$S_TIME on $S_DATE"
	display "Started processing at `stamp'"
end

program define endtime
	* Print the current system time and date as the end marker of a step.
	local stamp "$S_TIME on $S_DATE"
	display "Finished processing at `stamp'"
end

************************
* START OF THE PROGRAM *
************************

******************************************
* STEP 1: make early sample restrictions *
******************************************
display "***** Started processing STEP 1 *****"
starttime
if `gate1' == 1 {
	display "***** STEP 1: make early sample restrictions *****"

	* variables carried into the gender-specific files; the wildcards are *
	* resolved by keep and order against the dataset in memory            *
	local keyvars pid eid* tax_yr ///
		t1h_sex t1h_dv_age t1h_marst inimdb res* ///
		t1h_earn_t4* t1h_tot_inc_calc* emp_inc* tot_emp_inc* t1h_bus_inc_gross* t1h_bus_inc_net* tot_busi_t4* tot_busi_3680* ///
		y*

	forvalues y = `start_year'/`end_year' {
		use "`datadir'gc_cross_section_`y'_`ext'_w_cities.dta", clear

		* attach the longitudinal firm identifier, matched on eid and tax_yr; *
		* rows found only in the identifier file are discarded                *
		rename eid_Ind eid
		merge m:1 eid tax_yr using "`datadir'gc_longitudinal_eid_`ext'.dta"
		drop if _merge == 2

		* early sample restriction: ages 20-55 only; rows whose age is the *
		* system missing value (.) are not dropped at this stage           *
		drop if (t1h_dv_age > 55 | t1h_dv_age < 20) & t1h_dv_age != .

		* one output file per gender: t1h_sex 1 is saved as men, 2 as women *
		forvalues j = 1/2 {
			local gender : word `j' of men women

			* work on a copy so that the full year is back in memory afterwards *
			preserve
			keep if t1h_sex == `j'

			* keep key variables and put them in a fixed order *
			keep `keyvars'
			order `keyvars'

			* save intermediate dataset to the data_folder *
			compress
			save "`datadir'gc_cross_section_`gender'_`y'_`ext'.dta", replace
			restore
		}
		clear

		* the year's input file is erased once both gender files exist; *
		* capture keeps the loop going if the erase fails               *
		capture erase "`datadir'gc_cross_section_`y'_`ext'_w_cities.dta"
	}
	display "***** STEP 1: make early sample restrictions (COMPLETED) *****"
}
display "***** Finished processing STEP 1 *****"
endtime


**************************************************************
* STEP 2: append all cross-sections and create panel dataset *
**************************************************************
disp "***** Started processing STEP 2 *****"
starttime
if 1 == `gate2' {
	disp "***** STEP 2: append all cross-sections and create panel dataset *****"

	* The panel is built from start_year through end_year. The base file and
	* the first appended year both follow start_year, so that changing the
	* gate at the top of the do-file is enough to process a different window.
	local first_append = `start_year' + 1

	forvalues j = 1(1)2 {
		* j = 1 processes the men files, j = 2 the women files
		if `j' == 1 {
			local gender "men"
		}
		if `j' == 2 {
			local gender "women"
		}

		* load the first cross-section, then stack the later years below it
		use "`datadir'gc_cross_section_`gender'_`start_year'_`ext'.dta", clear

		* the loop body is skipped when start_year equals end_year
		forvalues y = `first_append'(1)`end_year' {
			append using "`datadir'gc_cross_section_`gender'_`y'_`ext'.dta"
		}

		* save final dataset to the data_folder *
		sort pid tax_yr
		compress
		save "`datadir'gc_analyze_me_`gender'_`ext'.dta", replace
		clear

		* delete large intermediate datasets as I go *
		* (capture ignores a failed erase, e.g. a file that is already gone)
		forvalues y = `start_year'(1)`end_year' {
			capture erase "`datadir'gc_cross_section_`gender'_`y'_`ext'.dta"
		}
	}
	disp "***** STEP 2: append all cross-sections and create panel dataset (COMPLETED) *****"
}
disp "***** Finished processing STEP 2 *****"
endtime

***********************************************************
* STEP 3: make sample restrictions and clean a little bit *
***********************************************************
display "***** Started processing STEP 3 *****"
starttime
if `gate3' == 1 {
	display "***** STEP 3: make sample restrictions and clean a little bit *****"

	* variables that make up the final individual panel *
	local panelvars pid eid* tax_yr ///
		sex dob_yr age t1h_marst inimdb res* ///
		t1h_earn_t4* t1h_tot_inc_calc* emp_inc* tot_emp_inc* t1h_bus_inc_gross* t1h_bus_inc_net* tot_busi_t4* tot_busi_3680* ///
		y*

	* nominal amounts that get converted into constant 2012 dollars *
	local moneyvars t1h_earn_t4 t1h_tot_inc_calc emp_inc tot_emp_inc t1h_bus_inc_gross t1h_bus_inc_net ///
		tot_busi_t4 tot_busi_3680 tot_busi_t4_startups tot_busi_3680_startups ///
		y_0 y_1 y_2 y_3_all y_3_startups res_pc_inc

	* multiplier applied to the amounts of each tax year (2012 is the base)  *
	* CPI is from Bank of Canada's core index: CANSIM Table 326-0021         *
	local cpi_2001 1.223132037
	local cpi_2002 1.195
	local cpi_2003 1.16927593
	local cpi_2004 1.151252408
	local cpi_2005 1.132701422
	local cpi_2006 1.111627907
	local cpi_2007 1.088342441
	local cpi_2008 1.069829902
	local cpi_2009 1.05193662
	local cpi_2010 1.033737024
	local cpi_2011 1.017021277
	local cpi_2012 1

	* j = 1 handles the men panel, j = 2 the women panel *
	forvalues j = 1/2 {
		local gender : word `j' of men women

		use "`datadir'gc_analyze_me_`gender'_`ext'.dta", clear

		* fill gaps in individual characteristics from the person's other years *
		sort pid tax_yr

		* age: take the largest implied birth year per person and rebuild age *
		* from it for every tax year                                          *
		generate dob_yr_tmp = tax_yr - t1h_dv_age
		by pid: egen dob_yr = max(dob_yr_tmp)
		generate age = tax_yr - dob_yr

		* rows without an age are dropped; the helper variable goes too *
		drop if missing(age)
		drop dob_yr_tmp

		* sex: largest reported code per person *
		by pid: egen sex = max(t1h_sex)

		* rows without a sex are dropped, then only this panel's code is kept *
		drop if missing(sex)
		keep if sex == `j'

		* keep only people age 20-55 *
		drop if age < 20 | age > 55

		* display the number of individuals: flag one row per pid and count *
		bysort pid: generate byte first_row = (_n == 1)
		count if first_row
		display "There are now `r(N)' individuals in the `gender' individual dataset"
		drop first_row

		* express every amount in 2012 dollars, one tax year at a time *
		foreach v of local moneyvars {
			forvalues yr = 2001/2012 {
				replace `v' = `v' * `cpi_`yr'' if tax_yr == `yr'
			}
		}

		* blank out the res_pc measures wherever res_pc itself is empty *
		foreach v in res_pc_inc res_pc_pop res_pc_busipop res_pc_shr_bus {
			replace `v' = . if res_pc == ""
		}

		* keep key variables and put them in a fixed order *
		keep `panelvars'
		order `panelvars'

		* save intermediate dataset to the data_folder *
		compress
		save "`datadir'gc_individual_panel_`gender'_`ext'.dta", replace
		clear

		* the appended input file is no longer needed; capture ignores a failed erase *
		capture erase "`datadir'gc_analyze_me_`gender'_`ext'.dta"
	}

	display "***** STEP 3: make sample restrictions (COMPLETED)*****"
}
display "***** Finished processing STEP 3 *****"
endtime

********************************************
* STEP 4: delete all intermediate datasets *
********************************************
* The opening message names STEP 4, matching the closing message of this step.
disp "***** Started processing STEP 4 *****"
starttime
if 1 == `gate4' {
	disp "***** STEP 4: delete all intermediate datasets *****"
	* This step contains no erase command of its own: STEPs 1 to 3 erase
	* their intermediate files as they go, so only the messages are printed.
	disp "***** STEP 4: delete all intermediate datasets (COMPLETED) *****"
}
disp "***** Finished processing STEP 4 *****"
endtime

*********************
* CLEAN UP AND EXIT *
*********************
* Empty the session, then close the log that was opened near the top of
* this do-file. The log is closed last so that everything above is recorded.
clear all
log close
